library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
data1 <- read_csv("Stats VVS Climate.csv")
## Rows: 5126 Columns: 46
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Country name
## dbl (45): year, Land area below 5m (% of land area), Agricultural land under...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data1
data2 <- read_csv("Final_Dataset.csv")
## Rows: 5771 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): Country, SO2_emissions_per_capita, NOx_emissions_per_capita
## dbl (17): Year, X.SO2_change_since_1990, Total_SO2_Emissions, X.CO2_change_s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data2
data2 <- data2 %>% 
  mutate(Country = replace(Country, Country == "United States of America", "United States")) %>%
  mutate(Country = replace(Country, Country == "United Kingdom of Great Britain and Northern Ireland", "United Kingdom"))

merged_data <- inner_join(data1, data2, by = c("Country name" = "Country", "year" = "Year"))
merged_data
temperatures <- read_csv("temp_country_2013.csv")
## Rows: 150822 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Country
## dbl  (3): Month, Year, Temp
## date (1): Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# merged_data %>% group_by(year) %>% select(2:ncol(.)) %>% summarise(is.na(cur_data))
na_fraction <- merged_data %>% mutate(across(3: ncol(.), ~ is.na(merged_data[[cur_column()]]))) %>%
  group_by(year) %>% summarise(across(3: ncol(.)-1, mean))
na_fraction_yearwise <- na_fraction%>% 
  rowwise() %>% mutate(na_per = mean(c_across(3:ncol(.)))) %>%
  select(year, na_per, everything())
na_fraction_yearwise
na_fraction_varwise <- na_fraction %>%
  summarise(across(3:ncol(.), mean))
na_fraction_varwise
# Fraction of na values in each year
na_fraction_yearwise %>% ggplot(aes(year, na_per)) + geom_col()

# Fraction of na values in each variable
na_fraction_varwise %>% pivot_longer(everything(), names_to = 'var', values_to = "na_per") %>%
  ggplot(aes(var, na_per)) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size=5))

var_names = c("CH4_emissions_per_capita", 
              "CO2_emissions_per_capita", 
              #"GDP ($)", 
              "GHG_emissions_per_capita",
              "N2O_emissions_per_capita",
              "NOx_emissions_per_capita",
              "Population",
              "Under-five mortality rate (per 1,000)")

#merged_data %>% select(all_of(var_names)) %>% 
#  mutate(NOx_emissions_per_capita = parse_double(NOx_emissions_per_capita)) %>%
#  #mutate_at(var_names, ~(scale(.) %>% as.vector)) %>%
#  pivot_longer(everything(), names_to = 'var', values_to = "values") %>%
#  ggplot(aes(x=var, y=values)) + geom_boxplot() + 
#  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size=10))

vars_with_littleNAs <- merged_data %>% select(all_of(var_names)) %>% 
  mutate(NOx_emissions_per_capita = parse_double(NOx_emissions_per_capita))
vars_with_littleNAs
mean1 <- vars_with_littleNAs %>% summarise(across(everything(), ~ mean(.x, na.rm=TRUE)))
mean1
std1 <- vars_with_littleNAs %>% summarise(across(everything(), ~ sd(.x, na.rm=TRUE)))
std1
vars_with_littleNAs_norm <- vars_with_littleNAs %>%
  mutate(across(everything(), ~ (.x - mean1[[cur_column()]])/std1[[cur_column()]]))
vars_with_littleNAs_norm
# box plot for some variables that have lesser NAs in them
vars_with_littleNAs_norm %>%
  pivot_longer(everything(), names_to = 'var', values_to = "values") %>%
  filter(!is.na(values)) %>%
  ggplot(aes(x=var, y=values)) + geom_boxplot() + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size=10))

# vars_with_littleNAs_norm %>% summarise(min(`Population`, na.rm = TRUE))
# vars_with_littleNAs %>% summarise(min(`Population`, na.rm = TRUE))
#  mutate_at(var_names, ~(scale(.) %>% as.vector)) %>% summarise(across(var_names, mean))
vars_with_littleNAs_norm %>%
  pivot_longer(everything(), names_to = 'var', values_to = "values") %>%
  filter(!is.na(values) & var == "CO2_emissions_per_capita") %>% summarise(mean(values), sd(values))
vars_with_littleNAs %>%
  pivot_longer(everything(), names_to = 'var', values_to = "values") %>%
  filter(!is.na(values) & var == "CO2_emissions_per_capita") %>% summarise(mean(values), sd(values))
country_names = c("India",
                  "United States",
                  "United Kingdom",
                  "Canada",
                  "Switzerland")
vars_with_littleNAs_country <- merged_data %>% 
  select(all_of(c("Country name", "year", var_names))) %>% 
  filter(`Country name` %in% country_names) %>%
  mutate(NOx_emissions_per_capita = parse_double(NOx_emissions_per_capita))
vars_with_littleNAs_country 
bar_plot_data <- vars_with_littleNAs_country %>% group_by(`Country name`) %>%
  summarise(across(3:ncol(.)-1, ~ mean(.x, na.rm = TRUE)), .groups = "drop") %>% 
  pivot_longer(2:ncol(.), names_to = "var", values_to = "values") %>%
  group_by(var) %>% mutate(values = values/max(values, na.rm = TRUE))
bar_plot_data
# bar plot comparing some variable values among a few countries
bar_plot_data %>%
  ggplot(aes(fill=`Country name`, y = values, x = var)) +
  geom_bar(position = "dodge", stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size=10))
## Warning: Removed 2 rows containing missing values (geom_bar).

temperatures_year_wise <- temperatures %>% group_by(Country, Year) %>%
  summarise(Temp = mean(Temp), .groups = "drop") %>%
  mutate(Country = replace(Country, Country == "France (Europe)", "France"))

temperatures_year_wise
merged_data_with_temp <- inner_join(merged_data, temperatures_year_wise, by = c("Country name" = "Country", "year" = "Year"))
merged_data_with_temp
unique(merged_data$`Country name`)
##   [1] "Afghanistan"              "Angola"                  
##   [3] "Albania"                  "United Arab Emirates"    
##   [5] "Argentina"                "Armenia"                 
##   [7] "Antigua and Barbuda"      "Australia"               
##   [9] "Austria"                  "Azerbaijan"              
##  [11] "Burundi"                  "Belgium"                 
##  [13] "Benin"                    "Burkina Faso"            
##  [15] "Bangladesh"               "Bulgaria"                
##  [17] "Bahrain"                  "Bosnia and Herzegovina"  
##  [19] "Belarus"                  "Belize"                  
##  [21] "Brazil"                   "Barbados"                
##  [23] "Brunei Darussalam"        "Bhutan"                  
##  [25] "Botswana"                 "Central African Republic"
##  [27] "Canada"                   "Switzerland"             
##  [29] "Chile"                    "China"                   
##  [31] "Cameroon"                 "Cook Islands"            
##  [33] "Colombia"                 "Comoros"                 
##  [35] "Costa Rica"               "Cuba"                    
##  [37] "Cyprus"                   "Germany"                 
##  [39] "Djibouti"                 "Dominica"                
##  [41] "Denmark"                  "Dominican Republic"      
##  [43] "Algeria"                  "Ecuador"                 
##  [45] "Eritrea"                  "Spain"                   
##  [47] "Estonia"                  "Ethiopia"                
##  [49] "Finland"                  "Fiji"                    
##  [51] "France"                   "Gabon"                   
##  [53] "United Kingdom"           "Georgia"                 
##  [55] "Ghana"                    "Guinea"                  
##  [57] "Guinea-Bissau"            "Greece"                  
##  [59] "Grenada"                  "Guatemala"               
##  [61] "Guyana"                   "Honduras"                
##  [63] "Croatia"                  "Haiti"                   
##  [65] "Hungary"                  "Indonesia"               
##  [67] "India"                    "Ireland"                 
##  [69] "Iraq"                     "Iceland"                 
##  [71] "Israel"                   "Italy"                   
##  [73] "Jamaica"                  "Jordan"                  
##  [75] "Japan"                    "Kazakhstan"              
##  [77] "Kenya"                    "Cambodia"                
##  [79] "Kiribati"                 "Kuwait"                  
##  [81] "Lebanon"                  "Liberia"                 
##  [83] "Liechtenstein"            "Sri Lanka"               
##  [85] "Lesotho"                  "Lithuania"               
##  [87] "Luxembourg"               "Latvia"                  
##  [89] "Morocco"                  "Monaco"                  
##  [91] "Madagascar"               "Maldives"                
##  [93] "Mexico"                   "Marshall Islands"        
##  [95] "Mali"                     "Malta"                   
##  [97] "Myanmar"                  "Montenegro"              
##  [99] "Mongolia"                 "Mozambique"              
## [101] "Mauritania"               "Mauritius"               
## [103] "Malawi"                   "Malaysia"                
## [105] "Namibia"                  "Niger"                   
## [107] "Nigeria"                  "Nicaragua"               
## [109] "Niue"                     "Netherlands"             
## [111] "Norway"                   "Nepal"                   
## [113] "Nauru"                    "New Zealand"             
## [115] "Oman"                     "Pakistan"                
## [117] "Panama"                   "Peru"                    
## [119] "Philippines"              "Palau"                   
## [121] "Papua New Guinea"         "Poland"                  
## [123] "Portugal"                 "Paraguay"                
## [125] "Qatar"                    "Romania"                 
## [127] "Russian Federation"       "Rwanda"                  
## [129] "Saudi Arabia"             "Sudan"                   
## [131] "Senegal"                  "Singapore"               
## [133] "Solomon Islands"          "Sierra Leone"            
## [135] "El Salvador"              "San Marino"              
## [137] "Serbia"                   "Sao Tome and Principe"   
## [139] "Suriname"                 "Slovenia"                
## [141] "Sweden"                   "Seychelles"              
## [143] "Syrian Arab Republic"     "Chad"                    
## [145] "Togo"                     "Thailand"                
## [147] "Tajikistan"               "Turkmenistan"            
## [149] "Timor-Leste"              "Tonga"                   
## [151] "Trinidad and Tobago"      "Tunisia"                 
## [153] "Turkey"                   "Tuvalu"                  
## [155] "Uganda"                   "Ukraine"                 
## [157] "Uruguay"                  "United States"           
## [159] "Uzbekistan"               "Vanuatu"                 
## [161] "Samoa"                    "South Africa"            
## [163] "Zambia"                   "Zimbabwe"
colnames(merged_data_with_temp)
##  [1] "Country name"                                                 
##  [2] "year"                                                         
##  [3] "Land area below 5m (% of land area)"                          
##  [4] "Agricultural land under irrigation (% of total ag. land)"     
##  [5] "Cereal yield (kg per hectare)"                                
##  [6] "Foreign direct investment, net inflows (% of GDP)"            
##  [7] "Access to electricity (% of total population)"                
##  [8] "Energy use per units of GDP (kg oil eq./$1,000 of 2005 PPP $)"
##  [9] "Energy use per capita (kilograms of oil equivalent)"          
## [10] "CO2 emissions, total (KtCO2)"                                 
## [11] "CO2 emissions per capita (metric tons)"                       
## [12] "CO2 emissions per units of GDP (kg/$1,000 of 2005 PPP $)"     
## [13] "Other GHG emissions, total (KtCO2e)"                          
## [14] "Methane (CH4) emissions, total (KtCO2e)"                      
## [15] "Nitrous oxide (N2O) emissions, total (KtCO2e)"                
## [16] "Disaster risk reduction progress score (1-5 scale; 5=best)"   
## [17] "GHG net emissions/removals by LUCF (MtCO2e)"                  
## [18] "Average annual precipitation (1961-1990, mm)"                 
## [19] "Droughts, floods, extreme temps (% pop. avg. 1990-2009)"      
## [20] "Population below 5m (% of total)"                             
## [21] "Population in urban agglomerations >1million (%)"             
## [22] "Annual freshwater withdrawals (% of internal resources)"      
## [23] "Nationally terrestrial protected areas (% of total land area)"
## [24] "Ease of doing business (ranking 1-183; 1=best)"               
## [25] "Invest. in energy w/ private participation ($)"               
## [26] "Invest. in telecoms w/ private participation ($)"             
## [27] "Invest. in transport w/ private participation ($)"            
## [28] "Invest. in water/sanit. w/ private participation ($)"         
## [29] "Public sector mgmt & institutions avg. (1-6 scale; 6=best)"   
## [30] "Paved roads (% of total roads)"                               
## [31] "GDP ($)"                                                      
## [32] "GNI per capita (Atlas $)"                                     
## [33] "Ratio of girls to boys in primary & secondary school (%)"     
## [34] "Primary completion rate, total (% of relevant age group)"     
## [35] "Under-five mortality rate (per 1,000)"                        
## [36] "Access to improved water source (% of total pop.)"            
## [37] "Nurses and midwives (per 1,000 people)"                       
## [38] "Physicians (per 1,000 people)"                                
## [39] "Malaria incidence rate (per 100,000 people)"                  
## [40] "Access to improved sanitation (% of total pop.)"              
## [41] "Child malnutrition, underweight (% of under age 5)"           
## [42] "Population living below $1.25 a day (% of total)"             
## [43] "Population growth (annual %)"                                 
## [44] "Population"                                                   
## [45] "Urban population growth (annual %)"                           
## [46] "Urban population"                                             
## [47] "X.SO2_change_since_1990"                                      
## [48] "SO2_emissions_per_capita"                                     
## [49] "Total_SO2_Emissions"                                          
## [50] "X.CO2_change_since_1990"                                      
## [51] "CO2_emissions_per_capita"                                     
## [52] "Total_CO2_Emissions"                                          
## [53] "X.GHG_change_since_1990"                                      
## [54] "GHG_emissions_per_capita"                                     
## [55] "Total_GHG_Emissions"                                          
## [56] "X.NO2_change_since_1990"                                      
## [57] "N2O_emissions_per_capita"                                     
## [58] "Total_N2O_Emissions"                                          
## [59] "X.Nox_change_since_1990"                                      
## [60] "NOx_emissions_per_capita"                                     
## [61] "Total_NOx_Emissions"                                          
## [62] "X.CH4_change_since_1990"                                      
## [63] "CH4_emissions_per_capita"                                     
## [64] "Total_CH4_Emissions"                                          
## [65] "Temp"
country_names = c("India",
                  "United States",
                  "United Kingdom",
                  "France",
                  "Switzerland")

# variation if temperature and some other variables with years
merged_data_with_temp %>% #select(3:ncol(.)) %>%
  mutate(across(3:ncol(.), as.double)) %>% 
  filter(`Country name` %in% country_names) %>%
  ggplot(aes(year, Temp, color = `Country name`)) + geom_line()

merged_data_with_temp %>% #select(3:ncol(.)) %>%
  mutate(across(3:ncol(.), as.double)) %>% 
  filter(`Country name` %in% country_names) %>%
  ggplot(aes(year, `Cereal yield (kg per hectare)`, color = `Country name`)) + geom_line()
## Warning: Removed 12 row(s) containing missing values (geom_path).

merged_data_with_temp %>% #select(3:ncol(.)) %>%
  mutate(across(3:ncol(.), as.double)) %>% 
  filter(`Country name` %in% country_names) %>%
  ggplot(aes(year, `CO2 emissions, total (KtCO2)`, color = `Country name`)) + geom_line()
## Warning: Removed 18 row(s) containing missing values (geom_path).

merged_data_with_temp %>% #select(3:ncol(.)) %>%
  mutate(across(3:ncol(.), as.double)) %>% 
  filter(`Country name` %in% country_names) %>%
  ggplot(aes(year, `Urban population growth (annual %)`, color = `Country name`)) + geom_line()
## Warning: Removed 6 row(s) containing missing values (geom_path).

merged_data_with_temp %>% #select(3:ncol(.)) %>%
  mutate(across(3:ncol(.), as.double)) %>% 
  filter(`Country name` %in% country_names) %>%
  ggplot(aes(year, `GDP ($)`, color = `Country name`)) + geom_line()
## Warning: Removed 6 row(s) containing missing values (geom_path).